This HTML report details an exploration of the nature of circadian genes in melanoma cell lines from the Broad Institute Depmap datasets using the depmap R package, cancer dependency data described by Tsherniak, Aviad, et al. “Defining a cancer dependency map.” Cell 170.3 (2017): 564-576.. The DepMap data enables us to look at CRISPR essentiality, copy number, mutation calls, and transcript expression for a large panel of cancer cell lines, including melanoma.
We will interrogate Depmap data using a curated list of genes that regulate the human circadian rhythm, which were taken from the publication The Circadian Clock Component RORA Increases Immunosurveillance in Melanoma by Inhibiting PD-L1 Expression, Liu, et al. 2024.
circadian_gene_list <- c(
"RORA", "PER3", "PER1", "CRY2", "RORB", "PER2", "NR1D2", "RORC", "CRY1",
"ARNTL", "NPAS2", "CLOCK", "NR1D1", "ARNTL2")
# driver_gene_list <- c(
# "BRAF", "NRAS", "NF1", "CDKN2A", "PTEN", "TP53", "TERT", "KIT", "GNAQ",
# "GNA11", "MITF", "MC1R", "CDK4")
Here, we connect to Ensembl via biomaRt (disabled by
eval=FALSE so it doesn’t automatically run each time). This
code retrieves gene metadata (e.g., gene names, coordinates) and saves
it as an .rds file.
## Connect to the Ensembl database
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
## get information for all human genes (to be used later)
getBM(
attributes = c(
"external_gene_name", "description", "entrezgene_id", "ensembl_gene_id",
"chromosome_name", "start_position", "end_position"),
mart = ensembl) %>%
dplyr::rename(gene_name = external_gene_name,
entrez_id = entrezgene_id,
ensembl_id = ensembl_gene_id,
chromosome = chromosome_name,
start = start_position,
end = end_position) %>%
dplyr::filter(stringr::str_length(gene_name) > 1,
stringr::str_length(chromosome) < 3) %>%
dplyr::mutate(description = gsub("\\[Source:.*", "", description)) %>%
as.data.frame() -> human_genes
saveRDS(human_genes, file = "./data/human_genes.rds")
human_genes <- readRDS(file = "./data/human_genes.rds")
We filter the previously loaded human_genes for only the
circadian and driver genes, then display those circadian genes in an
interactive datatable.
human_genes %>%
dplyr::filter(gene_name %in% circadian_gene_list) %>%
dplyr::arrange(gene_name) %>%
as.data.frame() -> circadian_genes
# human_genes %>%
# dplyr::filter(gene_name %in% driver_gene_list) %>%
# dplyr::arrange(gene_name) %>%
# as.data.frame() -> driver_genes
DT::datatable(circadian_genes)
We create an ExperimentHub object to pull relevant
DepMap datasets:
crispr: CRISPR-Cas9 essentiality (dependency) scores
copyNumber: copy number alterations per gene/cell line
TPM: transcript expression levels (RNA-seq)
mutationCalls: mutation data
metadata: cell-line metadata (including disease subtype).
## create ExperimentHub query object
eh <- ExperimentHub()
query(eh, "depmap")
#> ExperimentHub with 82 records
#> # snapshotDate(): 2024-10-24
#> # $dataprovider: Broad Institute
#> # $species: Homo sapiens
#> # $rdataclass: tibble
#> # additional mcols(): taxonomyid, genome, description,
#> # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> # rdatapath, sourceurl, sourcetype
#> # retrieve records with, e.g., 'object[["EH2260"]]'
#>
#> title
#> EH2260 | rnai_19Q1
#> EH2261 | crispr_19Q1
#> EH2262 | copyNumber_19Q1
#> EH2263 | RPPA_19Q1
#> EH2264 | TPM_19Q1
#> ... ...
#> EH7555 | copyNumber_22Q2
#> EH7556 | TPM_22Q2
#> EH7557 | mutationCalls_22Q2
#> EH7558 | metadata_22Q2
#> EH7559 | achilles_22Q2
metadata <- eh[["EH7558"]]
crispr <- eh[["EH7554"]]
copyNumber <- eh[["EH7555"]]
TPM <- eh[["EH7556"]]
mutationCalls <- eh[["EH7557"]]
We subset DepMap metadata for melanoma lines using keywords (“melanoma”) across relevant columns. We display the resulting set of melanoma cell lines in a datatable for easy scanning.
metadata %>%
dplyr::filter(grepl("melanoma", lineage_subtype) |
grepl("elanoma", Cellosaurus_NCIt_disease) |
grepl("elanoma", subtype_disease) |
grepl("elanoma", cell_line)) %>%
dplyr::select(-contains("issues")) %>%
as.data.frame() -> melanoma_metadata
DT::datatable(melanoma_metadata)
We visualize how dependent these cell lines are on each circadian gene using [plotly](https://plotly.com/r/. The dashed lines represent:
Red line: the mean dependency across all circadian genes in just melanoma lines.
Green line: the mean dependency across all lines and all genes in DepMap (global average).
crispr %>%
dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
gene_name %in% circadian_genes$gene_name,
!is.na(cell_line)) %>%
as.data.frame() -> crispr_df
crispr %>%
dplyr::filter(!is.na(dependency)) %>%
as.data.frame() -> crispr_global_dependency
crispr_df %>%
dplyr::filter(!is.na(dependency)) %>%
dplyr::group_by(gene_name) %>%
dplyr::summarize(mean_dependency = mean(dependency, na.remove = TRUE)) %>%
as.data.frame() -> crispr_gene_mean_dep
crispr_df %>%
dplyr::left_join(crispr_gene_mean_dep, by = "gene_name") %>%
dplyr::arrange(desc(mean_dependency)) %>%
dplyr::mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
dplyr::select(-c(gene, cell_line)) %>%
dplyr::left_join(metadata, by = "depmap_id") %>%
as.data.frame() -> crispr_gene_mean_dep_merged
crispr_gene_mean_dep_merged %>%
ggplot(aes(x = gene_name, y = dependency, color = subtype_disease)) +
geom_point(size = 0.75) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_hline(yintercept = mean(crispr_gene_mean_dep$mean_dependency, na.remove = TRUE),
color = "darkred", linetype = "dashed") +
geom_hline(yintercept = mean(crispr_global_dependency$dependency, na.remove = TRUE),
color = "darkgreen", linetype = "dashed") +
xlab("circadian rhythm-related genes") +
ggtitle("Circadian rhythm-related genes ranked by descending mean dependency score") -> p1
ggplotly(p1)
We reshape (pivot_wider) the data so that
rows = genes, columns = melanoma cell lines,
and values = CRISPR dependency scores. Then we call
pheatmap to visualize how essential each gene is across
multiple lines in a grid layout.
Interpretation: we don’t observe a strong pattern in dependency scores in circadian genes in melanoma cell lines
crispr_df %>%
dplyr::select(gene_name, cell_line, dependency) %>%
tidyr::pivot_wider(names_from = cell_line,
values_from = dependency) %>%
tibble::column_to_rownames(var = "gene_name") %>%
t() %>% as.data.frame() %>%
tibble::rownames_to_column(var = "cell_line") %>%
dplyr::left_join(melanoma_metadata, by = "cell_line") %>%
as.data.frame() -> crispr_ann_df
data.frame(status = crispr_ann_df$primary_or_metastasis,
subtype = crispr_ann_df$subtype_disease,
site = crispr_ann_df$sample_collection_site,
row.names = crispr_ann_df$cell_line) -> sample_col
crispr_ann_df %>%
dplyr::select(cell_line:RORB) %>%
tibble::column_to_rownames(var = "cell_line") %>%
t() %>%
pheatmap::pheatmap(
annotation_col = sample_col,
show_colnames = FALSE,
border_color = NA,
fontsize = 7,
main = paste0("Heatmap of dependency scores for circadian rhythm-related ",
"genes in melanoma cell lines"))
We check log2 copy-number values per gene. A dashed line at
y=1 (or y=0, depending on your scale) might
indicate a diploid reference. Higher or lower values suggest possible
amplification or deletion for these circadian genes in melanoma cells.
The genes are arranged by descending average copy number. For more
information how Depmap CNV is calculated, please refer to the DepMap
documentation.
Interpretation: RORC average copy number is highest in melanoma, but also displays the greatest variation. The other circadian-related genes display variation, but their mean CNV is close to 1
copyNumber %>%
dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
gene_name %in% circadian_genes$gene_name,
!is.na(cell_line)) %>%
as.data.frame() -> copy_number_df
copy_number_df %>%
dplyr::arrange(desc(log_copy_number)) %>%
dplyr::mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = log_copy_number, fill = gene_name)) +
geom_violin() +
geom_boxplot(width = 0.25) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
ggtitle(paste0("Log10 copy-number for circadian rhythm-related genes in",
" melanoma cell lines"))
We pivot from long to wide, building a matrix of TPM expression with
genes as rows and cell lines as columns. Then we optionally annotate
columns with metadata (metastatic status,
disease subtype) and generate a pheatmap to
see expression clustering.
Interpretation: Expression of PER2, RORA, RORB and RORC appears to be very low in all melanome cell lines. NPAS expression is the most elevated, while the remaining genes display expression somewhere in between.
TPM %>%
dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
gene_name %in% circadian_genes$gene_name,
!is.na(cell_line)) %>%
as.data.frame() -> tpm_df
tpm_df %>%
dplyr::select(gene_name, cell_line, rna_expression) %>%
tidyr::pivot_wider(names_from = cell_line,
values_from = rna_expression) %>%
tibble::column_to_rownames(var = "gene_name") %>%
t() %>% as.data.frame() %>%
tibble::rownames_to_column(var = "cell_line") %>%
dplyr::left_join(melanoma_metadata, by = "cell_line") %>%
as.data.frame() -> tpm_ann_df
data.frame(status = tpm_ann_df$primary_or_metastasis,
subtype = tpm_ann_df$subtype_disease,
site = tpm_ann_df$sample_collection_site,
row.names = tpm_ann_df$cell_line) -> sample_col
tpm_ann_df %>%
dplyr::select(cell_line:RORB) %>%
tibble::column_to_rownames(var = "cell_line") %>%
t() %>%
pheatmap::pheatmap(
annotation_col = sample_col,
fontsize = 7,
border_color = NA,
show_colnames = FALSE,
main = paste0("Heatmap of log10 gene expression for circadian rhythm-related ",
"genes in melanoma cell lines"))
We join CRISPR dependency data with expression (TPM) for
the same gene/cell line pairs, then plot expression vs. dependency. The
stat_cor(method = "spearman") call adds a correlation value
to see if higher expression correlates with lower (or higher)
essentiality.
Interpretation: We don’t observe a signfificant direct correlation between any circadian rhythm gene expression and CRISPR dependency scores.
Note: Normally, we would expect a relationship where increased gene expression displays increased dependency, if such a gene dependency exists.
tpm_df %>%
dplyr::select(-c(gene, cell_line, entrez_id)) %>%
dplyr::left_join(crispr_df, by = c("depmap_id", "gene_name")) %>%
dplyr::filter(!is.na(cell_line)) %>%
ggplot(aes(x = dependency, y = rna_expression, colour = gene_name)) +
geom_point() +
stat_smooth(method = "lm", se = TRUE, formula = y ~ poly(x, 1, raw = TRUE),
color = "black", linetype = "dashed", fill = "gray") +
ggpubr::stat_cor(method = "spearman", label.x = -0.5, label.y = 6) +
ggtitle(paste0("Correlation between circadian-rhythm gene expression and ",
"CRISPR dependency\nscores in Depmap melanoma cell lines")) +
facet_wrap(~gene_name, ncol = 3)
We look at mutationCalls table, combine with metadata, and keep only those entries with a circadian gene in melanoma cell lines.
mutationCalls %>%
dplyr::left_join(melanoma_metadata, by = "depmap_id") %>%
dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
gene_name %in% circadian_genes$gene_name,
!is.na(cell_line)) %>%
as.data.frame() -> mutation_calls_df
We create a balloon plot to see how mutation classification (e.g., “Missense”, “Nonsense”, “Splice site”) differs between primary vs. metastatic cell lines.
table(mutation_calls_df$var_class, mutation_calls_df$primary_or_metastasis) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap elanoma circadian gene mutation type by metastatic status")
We create a balloon plot to see how mutation classification (e.g., “Missense”, “Nonsense”, “Splice site”) differs between primary vs. metastatic cell lines.
table(mutation_calls_df$var_class, mutation_calls_df$gene_name) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap melanoma circadian gene mutation type by gene")
A balloon plot that partitions each bar by the circadian gene mutated, so you can see which genes tend to have which var_class.
table(mutation_calls_df$var_annotation, mutation_calls_df$primary_or_metastasis) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap melanoma circadian gene mutation annotation by metastatic status")
We plot the variant annotation to see how it splits between primary vs metastatic lines.
table(mutation_calls_df$var_annotation, mutation_calls_df$gene_name) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap melanoma circadian gene mutation annotations by gene")
We create a custom column mutation_type that concatenates ref_allele and alt_allele (e.g., C_to_T) to highlight known mutational signatures. The bar chart categorizes them by primary vs. metastatic. This often reveals a high frequency of UV-associated C>T transitions in melanoma.
UV Exposure: In primary cutaneous melanomas, C to T (and symmetrically G to A) transitions at dipyrimidine sites are the prototypical “UV signature.” Given the strong etiological of melanoma link with UV radiation, it is no surprise these transitions dominate.
mutation_calls_df %>%
dplyr::mutate(mutation_type = paste0(ref_allele, "_to_", alt_allele)) %>%
as.data.frame() -> mutation_calls_df2
table(mutation_calls_df2$mutation_type, mutation_calls_df2$primary_or_metastasis) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap melanoma circadian gene mutation transitions by metastatic status")
mutation_calls_df %>%
dplyr::mutate(mutation_type = paste0(ref_allele, "_to_", alt_allele)) %>%
as.data.frame() -> mutation_calls_df2
table(mutation_calls_df2$mutation_type, mutation_calls_df2$gene_name) %>%
as.data.frame() %>%
ggballoonplot(fill = "value") +
scale_fill_gradientn(colors = my_cols) +
ggtitle("Depmap melanoma circadian gene mutation transitions by gene")
This exploration underscores the utility of DepMap data and R-based visualization workflows to generate hypotheses about circadian gene roles in melanoma cell-line survival, copy number changes, and potentially UV-signature–associated mutations.
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Europe/Brussels
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] DT_0.33 pheatmap_1.0.12 plotly_4.10.4
#> [4] biomaRt_2.62.1 ExperimentHub_2.14.0 AnnotationHub_3.14.0
#> [7] BiocFileCache_2.14.0 dbplyr_2.5.0 BiocGenerics_0.52.0
#> [10] depmap_1.20.0 ggpubr_0.6.0 ggrepel_0.9.6
#> [13] stringr_1.5.1 viridis_0.6.5 viridisLite_0.4.2
#> [16] ggplot2_3.5.1 tibble_3.2.1 tidyr_1.3.1
#> [19] dplyr_1.1.4
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.2.3 gridExtra_2.3 httr2_1.1.0
#> [4] rlang_1.1.5 magrittr_2.0.3 compiler_4.4.2
#> [7] RSQLite_2.3.9 mgcv_1.9-1 png_0.1-8
#> [10] vctrs_0.6.5 pkgconfig_2.0.3 crayon_1.5.3
#> [13] fastmap_1.2.0 backports_1.5.0 XVector_0.46.0
#> [16] labeling_0.4.3 rmarkdown_2.29 UCSC.utils_1.2.0
#> [19] purrr_1.0.4 bit_4.5.0.1 xfun_0.50
#> [22] zlibbioc_1.52.0 cachem_1.1.0 GenomeInfoDb_1.42.3
#> [25] jsonlite_1.8.9 progress_1.2.3 blob_1.2.4
#> [28] broom_1.0.7 prettyunits_1.2.0 R6_2.6.1
#> [31] bslib_0.9.0 stringi_1.8.4 RColorBrewer_1.1-3
#> [34] car_3.1-3 jquerylib_0.1.4 Rcpp_1.0.14
#> [37] knitr_1.49 IRanges_2.40.1 Matrix_1.7-2
#> [40] splines_4.4.2 tidyselect_1.2.1 rstudioapi_0.17.1
#> [43] abind_1.4-8 yaml_2.3.10 curl_6.2.0
#> [46] lattice_0.22-6 Biobase_2.66.0 withr_3.0.2
#> [49] KEGGREST_1.46.0 evaluate_1.0.3 xml2_1.3.6
#> [52] Biostrings_2.74.1 pillar_1.10.1 BiocManager_1.30.25
#> [55] filelock_1.0.3 carData_3.0-5 stats4_4.4.2
#> [58] generics_0.1.3 BiocVersion_3.20.0 S4Vectors_0.44.0
#> [61] hms_1.1.3 munsell_0.5.1 scales_1.3.0
#> [64] glue_1.8.0 lazyeval_0.2.2 tools_4.4.2
#> [67] data.table_1.16.4 ggsignif_0.6.4 grid_4.4.2
#> [70] crosstalk_1.2.1 AnnotationDbi_1.68.0 colorspace_2.1-1
#> [73] nlme_3.1-167 GenomeInfoDbData_1.2.13 Formula_1.2-5
#> [76] cli_3.6.4 rappdirs_0.3.3 gtable_0.3.6
#> [79] rstatix_0.7.2 sass_0.4.9 digest_0.6.37
#> [82] farver_2.1.2 htmlwidgets_1.6.4 memoise_2.0.1
#> [85] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
#> [88] mime_0.12 bit64_4.6.0-1